### Code for scraping needed article info from the bulk download XML files
## Note: 
## 1. Please put this code file in your folder with those XML raw data. 
## Otherwise the code might not work.
## 2. Running this file requires a lot of memory. The first three scraping
## steps require at least 300 GB of RAM. The last step (MeSH tree number
## scraping) can be run with 8 GB of RAM.

################################################################################
###      Extract all needed PubMed article info and 
###      write the result as the article_meta csv file
################################################################################

setwd("YOUR WORKING DIRECTORY")
require(XML)
require(methods)
require(dplyr)
require(data.table)

### Construct the scraping functions

# Custom function to extract abstract and title out of XML.
#
# x    : XML node to query.
# path : XPath expression evaluated relative to `x`.
# fun  : function applied to every matching node (default: xmlValue).
# ...  : further arguments passed on to xpathSApply().
#
# Returns NA when nothing matches, the single value when exactly one node
# matches, and all values pasted together with a space in between when several
# nodes match (abstracts and titles can be split over several elements).
xpath2 <- function(x, path, fun = xmlValue, ...) {
  y <- xpathSApply(x, path, fun, ...)
  # The condition is a scalar, so plain if/else is used instead of the
  # vectorised ifelse().
  if (length(y) == 0) {
    NA
  } else if (length(y) > 1) {
    paste(unlist(y), collapse = " ")
  } else {
    y[[1]]
  }
}

# Final function: parse one PubMed XML file into a data frame of article
# metadata.
#
# xmlfile : path of the XML file to parse.
# ...     : accepted so the function can be used with apply-style callers;
#           not used inside the function.
#
# Returns a data frame with one row per <PubmedArticle> node and the columns
# row, pmid, journal_name, journal_NlmUniqueID, journal_ISSN, year, month,
# abstract and title. `row` is the 1-based position of the article inside the
# file; all other columns are character, with NA where xpath2() found nothing.
# A file without any <PubmedArticle> node yields a zero-row data frame.
get_article_meta <- function(xmlfile, ...) {
  # Assign the document variable and specify the nodes containing needed data
  dat <- xmlParse(file = xmlfile)
  nodes <- getNodeSet(dat, "//PubmedArticle")

  # Pull one field from every article. as.character() turns the logical NA
  # returned for a missing element into NA_character_, so every column has the
  # same type in every file and vapply() can enforce one value per article.
  extract <- function(path) {
    vapply(
      nodes,
      function(node) as.character(xpath2(node, path)),
      character(1),
      USE.NAMES = FALSE
    )
  }

  # Building the data frame in one step replaces the former chain of
  # one-column data frames joined by repeated merge() calls on `row`.
  data.frame(
    row = as.numeric(seq_along(nodes)),
    pmid = extract("./MedlineCitation/PMID"),
    journal_name = extract("./MedlineCitation/MedlineJournalInfo/MedlineTA"),
    journal_NlmUniqueID = extract(
      "./MedlineCitation/MedlineJournalInfo/NlmUniqueID"
    ),
    journal_ISSN = extract("./MedlineCitation/Article/Journal/ISSN"),
    year = extract(
      "./MedlineCitation/Article/Journal/JournalIssue/PubDate/Year"
    ),
    month = extract(
      "./MedlineCitation/Article/Journal/JournalIssue/PubDate/Month"
    ),
    abstract = extract("./MedlineCitation/Article/Abstract/AbstractText"),
    title = extract("./MedlineCitation/Article/ArticleTitle"),
    stringsAsFactors = FALSE
  )
}

### Now apply the future package to parallel the work (Our code below is based on LSF)
library(future)

## Original Source: https://github.com/hbs-rcs/sample_code/blob/master/R/R_parallel.R

## If using multiple cores on the grid is still too slow we can use multiple
## nodes on the compute grid. See https://grid.rcs.hbs.org/ for information.
## NOTE: this will only work on systems (like the HBS grid) with LSF available.

library(future.batchtools)
library(future.apply)

## Download template to the working directory:

# Fetch the batchtools LSF job template into the working directory; plan()
# below refers to it by file name.
download.file("https://raw.githubusercontent.com/mllg/batchtools/master/inst/templates/lsf-simple.tmpl", "lsf-simple.tmpl")

# Plan to use lsf (note that walltime is in seconds)
plan(batchtools_lsf,
     template = "lsf-simple.tmpl",
     resources = list(walltime = 300, memory = "1.4G", queue = "short"),
     workers = 715)

## Use same code as before, but now runs on LSF nodes instead of local CPU
## cores!

# Read the list of XML file names to process (first column of files.txt).
# NOTE(review): the plan above is configured, but the loop below uses plain
# lapply(), so the files are parsed sequentially in this R session rather than
# through future_lapply() -- confirm whether that is intended.
all_files <- read.table("data_cleaning/raw_data/files.txt")
all_files <- as.vector(all_files$V1)

start.time = Sys.time()
# Parse every file and stack the per-file data frames.
article_meta <- bind_rows(lapply(all_files, get_article_meta))
# Drop the per-file row index that get_article_meta() returns.
article_meta <- subset(article_meta, select = -row)

end.time = Sys.time()
time.taken = round(end.time - start.time, 2)
# Print the elapsed time.
time.taken

# Write the df out as a csv (write.csv's default also writes the row names).
write.csv(article_meta, "data_cleaning/intermediate_data/article_meta.csv")

################################################################################
###      Extract all needed PubMed author info and 
###      write the result as the author_meta csv file
################################################################################

library(xml2)

### Construct the scraping functions
## Scrape the name of one author of one publication.
##
## author1 : one child of an AuthorList, as a nested R list (get_names()
##           passes the elements of the list produced by xml2::as_list()).
##
## Returns a 1 x 2 matrix holding (last name, first name); a component that
## is not present in `author1` is NA.
get_1_name <- function(author1) {
  fore <- author1[["ForeName"]]
  last <- author1[["LastName"]]
  fore_present <- length(fore) > 0
  last_present <- length(last) > 0

  # Both parts available
  if (fore_present && last_present) {
    last_name <- last[[1]]
    first_name <- fore[[1]]
    return(cbind(last_name, first_name))
  }

  # First name only
  if (fore_present) {
    first_name <- fore[[1]]
    return(cbind(NA, first_name))
  }

  # Last name only
  if (last_present) {
    last_name <- last[[1]]
    return(cbind(last_name, NA))
  }

  # Neither part available
  cbind(NA, NA)
}

## Get all author names from one article.
##
## article : one child of the PubmedArticleSet, as a nested R list.
##
## Returns a matrix with three columns (PMID, last name, first name) and one
## row per author. An article without authors gives a single row with NA
## names. A record without a MedlineCitation/PMID gives a zero-row, 3-column
## matrix, so it can still be row-bound with the results of other articles.
get_names <- function(article) {
  PMID <- article[["MedlineCitation"]][["PMID"]][[1]]

  # Without this guard cbind(NULL, NA, NA) silently drops the NULL and returns
  # only two columns, which makes the caller's rbind() fail for the whole file.
  if (is.null(PMID)) {
    return(matrix(NA_character_, nrow = 0, ncol = 3))
  }

  authorList <- article[["MedlineCitation"]][["Article"]][["AuthorList"]]
  if (length(authorList) == 0) {
    return(cbind(PMID, NA, NA))
  }

  # unname() keeps the element names from being passed to rbind() as argument
  # names.
  Names <- do.call(rbind, lapply(unname(authorList), get_1_name))
  cbind(matrix(rep(PMID, nrow(Names)), ncol = 1), Names)
}

## Apply get_names() to all articles in a file (the final function).
##
## file : path of a PubMed XML file.
##
## Returns a data frame with the columns PMID, LastName and FirstName, one row
## per (article, author) pair; it has zero rows when the file holds no
## PubmedArticle. If anything fails, the error is printed together with the
## file name and NULL is returned, so one bad file does not stop the batch.
get_authors <- function(file) {
  tryCatch({
    # Turn the whole xml file into an r list to avoid
    # further expensive operations between xml and r list (different objects)
    doc <- as_list(read_xml(file))
    article_list <- doc[["PubmedArticleSet"]]

    # Keep only real articles: other children of the set carry no
    # MedlineCitation/PMID and used to abort the whole file.
    article_list <- article_list[names(article_list) == "PubmedArticle"]

    out <- do.call(rbind, lapply(unname(article_list), get_names))
    if (is.null(out)) {
      # No articles at all: still return the expected three columns.
      out <- matrix(character(0), nrow = 0, ncol = 3)
    }
    colnames(out) <- c("PMID", "LastName", "FirstName")
    data.frame(out)
  }, error = function(e) {
    cat("ERROR :", conditionMessage(e), "for i=", file, "\n")
    NULL
  })
}


### Now parallel the jobs to scrape through all XML files

require(future)

## Original Source: https://github.com/hbs-rcs/sample_code/blob/master/R/R_parallel.R

## If using multiple cores on the grid is still too slow we can use multiple
## nodes on the compute grid. See https://grid.rcs.hbs.org/ for information.
## NOTE: this will only work on systems (like the HBS grid) with LSF available.

require(future.batchtools)
require(future.apply)
require(dplyr)

## Download template to the working directory:

# Fetch the batchtools LSF job template into the working directory; plan()
# below refers to it by file name.
download.file("https://raw.githubusercontent.com/mllg/batchtools/master/inst/templates/lsf-simple.tmpl", "lsf-simple.tmpl")

# Plan to use lsf (note that walltime is in seconds)
plan(batchtools_lsf,
     template = "lsf-simple.tmpl",
     resources = list(walltime = 84000, memory = "30G", queue = "short"),
     workers = 715)

# Now use future_lapply to loop through all xml files
all_files <- read.table("data_cleaning/raw_data/files.txt")
all_files <- as.vector(all_files$V1)
# used to split the jobs
all_files <- all_files[1:715]

start.time <- Sys.time()

# get_authors() returns NULL for a file it could not process; bind_rows()
# skips those entries. The copy-pasted
# `article_meta <- subset(article_meta, select = -row)` that used to follow
# was removed: author_meta has no `row` column, and article_meta has already
# lost its own in the article section, so the call raised an error.
author_meta <- bind_rows(future_lapply(all_files, get_authors))

end.time <- Sys.time()
time.taken <- round(end.time - start.time, 2)
time.taken

# Write the df out as a csv
# NOTE(review): this folder is spelled "intermeidate_data" here but
# "intermediate_data" in the article_meta export -- confirm which one exists.
write.csv(author_meta, "data_cleaning/intermeidate_data/author_meta.csv")

################################################################################
###      Extract all needed PubMed MeSH info and 
###      write the result as the mesh_meta csv file
################################################################################

### Construct the scraping functions
# Scrape the MeSH headings of every article in one PubMed XML file.
#
# file : the xml file name (the only input that is used).
# ...  : accepted for apply-style callers; not used.
#
# Returns a data frame in "pmid -> descriptor -> qualifier" long format with
# the columns PMID, DescriptorName, DescriptorID, Descriptor_MajorTopic,
# QualifierName, QualifierID and Qualifier_MajorTopic:
#   * an article without MeSH headings gives one row whose six MeSH columns
#     hold the string "NULL";
#   * a descriptor without qualifiers gives one row whose three qualifier
#     columns hold the string "NULL";
#   * a descriptor with k qualifiers gives k rows, repeating the descriptor.
# Children of the PubmedArticleSet that are not PubmedArticle elements are
# skipped, and a file without articles gives a zero-row data frame.
mesh <- function(file, ...) {
  col_names <- c("PMID", "DescriptorName", "DescriptorID",
                 "Descriptor_MajorTopic", "QualifierName", "QualifierID",
                 "Qualifier_MajorTopic")

  # First value of `x` as a string, or NA when there is nothing to take.
  first_or_na <- function(x) {
    if (length(x) == 0) NA_character_ else as.character(x[[1]])[1]
  }

  # Text, UI and MajorTopicYN of one DescriptorName / QualifierName element.
  term_fields <- function(term) {
    c(first_or_na(term),
      first_or_na(attr(term, which = "UI")),
      first_or_na(attr(term, which = "MajorTopicYN")))
  }

  # One MeshHeading -> character matrix with six columns. The first child is
  # taken as the descriptor, all remaining children as qualifiers.
  heading_rows <- function(heading) {
    descriptor <- if (length(heading) > 0) heading[[1]] else NULL
    descriptor_part <- term_fields(descriptor)
    qualifiers <- unname(heading[-1])
    if (length(qualifiers) == 0) {
      return(matrix(c(descriptor_part, rep("NULL", 3)), nrow = 1))
    }
    qualifier_part <- t(vapply(qualifiers, term_fields, character(3),
                               USE.NAMES = FALSE))
    cbind(
      matrix(descriptor_part, nrow = length(qualifiers), ncol = 3,
             byrow = TRUE),
      qualifier_part
    )
  }

  # One article -> character matrix with seven columns (pmid first).
  article_rows <- function(article) {
    citation <- article[["MedlineCitation"]]
    pmid <- first_or_na(citation[["PMID"]])
    headings <- unname(citation[["MeshHeadingList"]])
    if (length(headings) == 0) {
      return(matrix(c(pmid, rep("NULL", 6)), nrow = 1))
    }
    cbind(pmid, do.call(rbind, lapply(headings, heading_rows)))
  }

  # convert the xml file to a big r list
  doc <- as_list(read_xml(file))
  articles <- doc[["PubmedArticleSet"]]
  # Only PubmedArticle children carry MedlineCitation/PMID; anything else
  # would leave the PMID column shorter than the other columns.
  articles <- unname(articles[names(articles) == "PubmedArticle"])

  # Build one matrix per article and bind once at the end instead of growing
  # seven vectors element by element.
  pieces <- lapply(articles, article_rows)
  out <- if (length(pieces) > 0) {
    do.call(rbind, pieces)
  } else {
    matrix(character(0), nrow = 0, ncol = length(col_names))
  }
  dimnames(out) <- list(NULL, col_names)

  # combine the columns to a dataframe
  data.frame(out)
}


### Parallel the jobs

require(future)

## Original Source: https://github.com/hbs-rcs/sample_code/blob/master/R/R_parallel.R

## If using multiple cores on the grid is still too slow we can use multiple
## nodes on the compute grid. See https://grid.rcs.hbs.org/ for information.
## NOTE: this will only work on systems (like the HBS grid) with LSF available.

require(future.batchtools)
require(future.apply)
require(dplyr)

## Download template to the working directory:

# Fetch the batchtools LSF job template into the working directory; plan()
# below refers to it by file name.
download.file("https://raw.githubusercontent.com/mllg/batchtools/master/inst/templates/lsf-simple.tmpl", "lsf-simple.tmpl")

# Plan to use lsf (note that walltime is in seconds)
plan(batchtools_lsf,
     template = "lsf-simple.tmpl",
     resources = list(walltime = 28800, memory = "50G", queue = "short"),
     workers = 715)

# Now use future_lapply to loop through all xml files
all_files <- read.table("data_cleaning/raw_data/files.txt")
all_files <- as.vector(all_files$V1)

start.time <- Sys.time()

# The copy-pasted `article_meta <- subset(article_meta, select = -row)` that
# used to follow was removed: mesh_meta has no `row` column, and article_meta
# has already lost its own in the article section, so the call raised an error.
mesh_meta <- bind_rows(future_lapply(all_files, mesh))

end.time <- Sys.time()
time.taken <- round(end.time - start.time, 2)
time.taken

# Write the df out as a csv
# NOTE(review): this folder is spelled "intermeidate_data" here but
# "intermediate_data" in the article_meta export -- confirm which one exists.
write.csv(mesh_meta, "data_cleaning/intermeidate_data/mesh_meta.csv")


################################################################################
###            Extract publication type related information
################################################################################

# Extraction Process
# Custom function to extract info out of XML.
#
# x    : XML node to query.
# path : XPath expression evaluated relative to `x`.
# fun  : function applied to every matching node (default: xmlValue).
# ...  : further arguments passed on to xpathSApply().
#
# Returns NA when nothing matches, the single value when exactly one node
# matches, and all values pasted together with a space in between when several
# nodes match (text can be split over several elements).
xpath2 <- function(x, path, fun = xmlValue, ...) {
  y <- xpathSApply(x, path, fun, ...)
  # The condition is a scalar, so plain if/else is used instead of the
  # vectorised ifelse().
  if (length(y) == 0) {
    NA
  } else if (length(y) > 1) {
    paste(unlist(y), collapse = " ")
  } else {
    y[[1]]
  }
}

# Parse one PubMed XML file into a data frame of publication-type information.
#
# xmlfile : path of the XML file to parse.
# ...     : accepted for apply-style callers; not used.
#
# Returns a data frame with one row per <PubmedArticle> node and the columns
# row, pmid, type, abstract, title, year and journal_name. `row` is the
# 1-based position of the article inside the file; all other columns are
# character, with NA where xpath2() found nothing. A file without any
# <PubmedArticle> node yields a zero-row data frame.
get_type_meta <- function(xmlfile, ...) {
  # Assign the document variable and specify the nodes containing needed data
  dat <- xmlParse(file = xmlfile)
  nodes <- getNodeSet(dat, "//PubmedArticle")

  # Pull one field from every article; as.character() turns the logical NA of
  # a missing element into NA_character_ so column types never vary by file.
  extract <- function(path) {
    vapply(
      nodes,
      function(node) as.character(xpath2(node, path)),
      character(1),
      USE.NAMES = FALSE
    )
  }

  # One data.frame() call replaces the former chain of one-column data frames
  # joined by repeated merge() calls on `row`.
  data.frame(
    row = as.numeric(seq_along(nodes)),
    pmid = extract("./MedlineCitation/PMID"),
    type = extract(
      "./MedlineCitation/Article/PublicationTypeList/PublicationType"
    ),
    abstract = extract("./MedlineCitation/Article/Abstract/AbstractText"),
    title = extract("./MedlineCitation/Article/ArticleTitle"),
    year = extract(
      "./MedlineCitation/Article/Journal/JournalIssue/PubDate/Year"
    ),
    journal_name = extract("./MedlineCitation/MedlineJournalInfo/MedlineTA"),
    stringsAsFactors = FALSE
  )
}

# Now apply the future package to parallel the work
library(future)

## Original Source: https://github.com/hbs-rcs/sample_code/blob/master/R/R_parallel.R

## If using multiple cores on the grid is still too slow we can use multiple
## nodes on the compute grid. See https://grid.rcs.hbs.org/ for information.
## NOTE: this will only work on systems (like the HBS grid) with LSF available.

library(future.batchtools)
library(future.apply)

## Download template to the working directory:

# Fetch the batchtools LSF job template into the working directory; plan()
# below refers to it by file name.
download.file("https://raw.githubusercontent.com/mllg/batchtools/master/inst/templates/lsf-simple.tmpl", "lsf-simple.tmpl")

# Plan to use lsf (note that walltime is in seconds)
plan(batchtools_lsf,
     template = "lsf-simple.tmpl",
     resources = list(walltime = 300, memory = "10G", queue = "short"),
     workers = 1000)

## Use same code as before, but now runs on LSF nodes instead of local CPU
## cores!

# Get all xml file names in the working directory.
# NOTE(review): `pattern` is a regular expression, so ".xml" matches any
# character followed by "xml" anywhere in the name (it is not anchored to the
# file extension) -- confirm that no unintended files are picked up.
files <- as.vector(list.files(path = "./", pattern = ".xml"))

start.time = Sys.time()
# Leftover debugging line (dry run of the scheduler with a no-op function):
# a = future_lapply(all_files, function(x) NULL)
# Parse every file on the workers and stack the per-file data frames.
all_article_meta <- bind_rows(future_lapply(files, get_type_meta))
# Drop the per-file row index that get_type_meta() returns.
all_article_meta <- subset(all_article_meta, select = -row)

end.time = Sys.time()
time.taken = round(end.time - start.time, 2)
# Print the elapsed time.
time.taken

# Write the df out as a csv
# NOTE(review): this folder is spelled "intermeidate_data" here but
# "intermediate_data" in the article_meta export -- confirm which one exists.
fwrite(all_article_meta, "data_cleaning/intermeidate_data/all_article_meta.csv")


################################################################################
###   Extract all MeSH tree numbers for each MeSH term from desc2020.xml
################################################################################

require(XML)
require(methods)
require(dplyr)
require(tidyr)

# Custom function to extract all Tree Numbers for each MeSH term out of XML.
#
# x    : XML node to query.
# path : XPath expression evaluated relative to `x`.
# fun  : function applied to every matching node (default: xmlValue).
# ...  : further arguments passed on to xpathSApply().
#
# Returns NA when nothing matches, the single value when exactly one node
# matches, and all values pasted together WITHOUT a separator when several
# nodes match (unlike the earlier versions of this helper, which use a space).
xpath2 <- function(x, path, fun = xmlValue, ...) {
  y <- xpathSApply(x, path, fun, ...)
  # The condition is a scalar, so plain if/else is used instead of the
  # vectorised ifelse().
  if (length(y) == 0) {
    NA
  } else if (length(y) > 1) {
    paste(unlist(y), collapse = "")
  } else {
    y[[1]]
  }
}

# Final function to scrape the MeSH tree number info.
#
# xmlfile : path of the MeSH descriptor XML file to parse.
# ...     : accepted for apply-style callers; not used.
#
# Returns a data frame with one row per <DescriptorRecord> node and the
# columns row, UI, DescriptorName and TreeNumber. `row` is the 1-based
# position of the record inside the file; the other columns are character,
# with NA where xpath2() found nothing. TreeNumber holds whatever xpath2()
# returns for the record's TreeNumberList element. A file without any
# <DescriptorRecord> node yields a zero-row data frame.
get_descriptor <- function(xmlfile, ...) {
  # Assign the document variable and specify the nodes containing needed data
  dat <- xmlParse(file = xmlfile)
  nodes <- getNodeSet(dat, "//DescriptorRecord")

  # Pull one field from every record; as.character() turns the logical NA of
  # a missing element into NA_character_.
  extract <- function(path) {
    vapply(
      nodes,
      function(node) as.character(xpath2(node, path)),
      character(1),
      USE.NAMES = FALSE
    )
  }

  # One data.frame() call replaces the former one-column data frames joined
  # by merge() on `row`.
  data.frame(
    row = as.numeric(seq_along(nodes)),
    UI = extract("./DescriptorUI"),
    DescriptorName = extract("./DescriptorName"),
    TreeNumber = extract("./TreeNumberList"),
    stringsAsFactors = FALSE
  )
}

# Scrape the descriptor file into one row per MeSH descriptor.
# NOTE(review): the section header names the file "desc2020.xml" while this
# call passes "desc2020" -- confirm the file name on disk.
desc <- get_descriptor("desc2020")

# Split the concatenated tree numbers into separate columns. The separator is
# a zero-width look-ahead that cuts in front of every capital letter, so the
# piece before the first capital letter lands in TreeNumber0 (dropped below;
# presumably always empty -- verify). Up to 20 tree numbers are kept and
# shorter lists are padded with NA on the right.
# The previous code passed the undefined objects `a` and `test` here; both
# calls are meant to operate on `desc`.
desc <- separate(desc, TreeNumber,
                 into = paste0("TreeNumber", 0:20),
                 sep = "(?=[A-Z])", fill = "right")
desc <- select(desc, -row, -TreeNumber0)

# Write the df out as a csv
# NOTE(review): this folder is spelled "intermeidate_data" here but
# "intermediate_data" in the article_meta export -- confirm which one exists.
write.csv(desc, "data_cleaning/intermeidate_data/descriptor_tree.csv")

################################################################################
###   Extract all clinical trial related information for text prediction
################################################################################

# Extraction Process
# Custom function to extract info out of XML.
#
# x    : XML node to query.
# path : XPath expression evaluated relative to `x`.
# fun  : function applied to every matching node (default: xmlValue).
# ...  : further arguments passed on to xpathSApply().
#
# Returns NA when nothing matches, the single value when exactly one node
# matches, and all values pasted together with a space in between when several
# nodes match (text can be split over several elements).
xpath2 <- function(x, path, fun = xmlValue, ...) {
  y <- xpathSApply(x, path, fun, ...)
  # The condition is a scalar, so plain if/else is used instead of the
  # vectorised ifelse().
  if (length(y) == 0) {
    NA
  } else if (length(y) > 1) {
    paste(unlist(y), collapse = " ")
  } else {
    y[[1]]
  }
}

# Parse one clinical-trial XML file into a data frame of trial metadata.
#
# xmlfile : file name inside the "./clinical/" folder (the folder is
#           prepended here, so pass the bare file name).
# ...     : accepted for apply-style callers; not used.
#
# Returns a data frame with one row per <clinical_study> node and the columns
# row, nct_id, brief_title, official_title, brief_sum, detail_des,
# overall_status, start_date, completion_date, primary_completion_date,
# eligibility_criteria, trial_gender, primary_outcome_measure,
# secondary_outcome_measure, primary_outcome_description,
# secondary_outcome_description, condition and keywords. `row` is the 1-based
# position of the study inside the file; all other columns are character,
# with NA where xpath2() found nothing. A file without any <clinical_study>
# node yields a zero-row data frame.
get_clinical_meta <- function(xmlfile, ...) {
  # declare the right file directory path
  xmlfile <- paste0("./clinical/", xmlfile)

  # Assign the document variable and specify the nodes containing needed data
  dat <- xmlParse(file = xmlfile)
  nodes <- getNodeSet(dat, "//clinical_study")

  # Output column name -> XPath relative to the <clinical_study> node.
  # The order of this vector is the column order of the result.
  paths <- c(
    nct_id = "./id_info/nct_id",
    brief_title = "./brief_title",
    official_title = "./official_title",
    brief_sum = "./brief_summary/textblock",
    detail_des = "./detailed_description/textblock",
    overall_status = "./overall_status",
    start_date = "./start_date",
    completion_date = "./completion_date",
    primary_completion_date = "./primary_completion_date",
    eligibility_criteria = "./eligibility/criteria/textblock",
    trial_gender = "./eligibility/gender",
    primary_outcome_measure = "./primary_outcome/measure",
    secondary_outcome_measure = "./secondary_outcome/measure",
    primary_outcome_description = "./primary_outcome/description",
    secondary_outcome_description = "./secondary_outcome/description",
    condition = "./condition",
    keywords = "./keyword"
  )

  # Pull one field from every study; as.character() turns the logical NA of a
  # missing element into NA_character_ so column types never vary by file.
  extract <- function(path) {
    vapply(
      nodes,
      function(node) as.character(xpath2(node, path)),
      character(1),
      USE.NAMES = FALSE
    )
  }

  # Build all columns in one pass instead of seventeen one-column data frames
  # joined by repeated merge() calls on `row`.
  columns <- c(
    list(row = as.numeric(seq_along(nodes))),
    lapply(paths, extract)
  )
  as.data.frame(columns, stringsAsFactors = FALSE)
}


# Now apply the future package to parallel the work
library(future)

## Original Source: https://github.com/hbs-rcs/sample_code/blob/master/R/R_parallel.R

## If using multiple cores on the grid is still too slow we can use multiple
## nodes on the compute grid. See https://grid.rcs.hbs.org/ for information.
## NOTE: this will only work on systems (like the HBS grid) with LSF available.

library(future.batchtools)
library(future.apply)

## Download template to the working directory:

# Fetch the batchtools LSF job template into the working directory; plan()
# below refers to it by file name.
download.file("https://raw.githubusercontent.com/mllg/batchtools/master/inst/templates/lsf-simple.tmpl", "lsf-simple.tmpl")

# Plan to use lsf (note that walltime is in seconds)
plan(batchtools_lsf,
     template = "lsf-simple.tmpl",
     resources = list(walltime = 300, memory = "500M", queue = "short"),
     workers = 150)

## Use same code as before, but now runs on LSF nodes instead of local CPU
## cores!

# get all xml file names (these files are NOT included in our replication package. 
# To get these data, see the data_cleaning folder's documentation.)
# Every entry of ./clinical/ is listed; get_clinical_meta() prepends the
# folder name itself, so bare file names are what it expects.
files <- as.vector(list.files(path = "./clinical/"))

start.time = Sys.time()
# Leftover debugging line (dry run of the scheduler with a no-op function):
# a = future_lapply(all_files, function(x) NULL)
# Parse every file on the workers and stack the per-file data frames.
clinical_meta <- bind_rows(future_lapply(files, get_clinical_meta))
# Drop the per-file row index that get_clinical_meta() returns.
clinical_meta <- subset(clinical_meta, select = -row)

end.time = Sys.time()
time.taken = round(end.time - start.time, 2)
# Print the elapsed time.
time.taken

# Write the df out as a csv (without row names)
# NOTE(review): this folder is spelled "intermeidate_data" here but
# "intermediate_data" in the article_meta export -- confirm which one exists.
write.csv(clinical_meta, "data_cleaning/intermeidate_data/clinical_meta.csv", row.names = F)